import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.lines as mlines
building_id: building code-name with the structure SiteID_SimplifiedUsage_UniqueName.site_id: animal-code-name for the site.building_id_kaggle: building ID used for the Kaggle competition (numeric).site_id_kaggle: site ID used for the Kaggle competition (numeric).primaryspaceusage: Primary space usage of all buildings is mapped using the energystar scheme building description types.sub_primaryspaceusage: energystar scheme building description types subcategory.sqm: Floor area of building in square meters (m2).lat: Latitude of building location to city level.lng: Longitude of building location to city level.timezone: site's timezone.electricity: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.hotwater: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.chilledwater: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.steam: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.water: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.irrigation: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.solar presence of this kind of meter in the building. Yes if affirmative, NaN if negative.gas: presence of this kind of meter in the building. Yes if affirmative, NaN if negative.industry: Industry type corresponding to building.subindustry: More detailed breakdown of Industry type corresponding to building.heatingtype: Type of heating in corresponding building.yearbuilt: Year corresponding to when building was first constructed, in the format YYYY.date_opened: Date building was opened for use, in the format D/M/YYYY.numberoffloors: Number of floors corresponding to building.occupants: Usual number of occupants in the building.energystarscore: Rating of building corresponding to building energystar scheme (Energy Star Score).eui: Energy use intensity of the building (kWh/year/m2).site_eui: Energy (Consumed/Purchased) use intensity of the site (kWh/year/m2).source_eui: Total primary energy consumption normalized by area (Takes into account conversion efficiency of primary energy into secondary energy).leed_level: LEED rating of the building (Leadership in Energy and Environmental Design), most widely used green building rating system.rating: Other building energy ratings.building = pd.read_csv("./data/building_metadata.csv")
building.head()
| building_id | site_id | building_id_kaggle | site_id_kaggle | primaryspaceusage | sub_primaryspaceusage | sqm | sqft | lat | lng | ... | yearbuilt | date_opened | numberoffloors | occupants | energystarscore | eui | site_eui | source_eui | leed_level | rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Panther_lodging_Dean | Panther | NaN | 0.0 | Lodging/residential | Residence Hall | 508.8 | 5477.0 | 28.517689 | -81.379039 | ... | 1989.0 | NaN | NaN | NaN | NaN | 271 | NaN | NaN | None | NaN |
| 1 | Panther_lodging_Shelia | Panther | NaN | 0.0 | Lodging/residential | Residence Hall | 929.0 | 10000.0 | 28.517689 | -81.379039 | ... | 1992.0 | NaN | NaN | NaN | NaN | 62 | NaN | NaN | None | NaN |
| 2 | Panther_lodging_Ricky | Panther | NaN | 0.0 | Lodging/residential | Residence Hall | 483.1 | 5200.0 | 28.517689 | -81.379039 | ... | 2016.0 | NaN | NaN | NaN | NaN | 534 | NaN | NaN | None | NaN |
| 3 | Panther_education_Rosalie | Panther | 0.0 | 0.0 | Education | Research | 690.5 | 7432.0 | 28.517689 | -81.379039 | ... | 2008.0 | NaN | NaN | NaN | NaN | 276 | NaN | NaN | None | NaN |
| 4 | Panther_education_Misty | Panther | 1.0 | 0.0 | Education | Research | 252.7 | 2720.0 | 28.517689 | -81.379039 | ... | 2004.0 | NaN | NaN | NaN | NaN | 375 | NaN | NaN | None | NaN |
5 rows × 32 columns
building.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1636 entries, 0 to 1635 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 building_id 1636 non-null object 1 site_id 1636 non-null object 2 building_id_kaggle 1449 non-null float64 3 site_id_kaggle 1599 non-null float64 4 primaryspaceusage 1615 non-null object 5 sub_primaryspaceusage 1615 non-null object 6 sqm 1636 non-null float64 7 sqft 1636 non-null float64 8 lat 1399 non-null float64 9 lng 1399 non-null float64 10 timezone 1636 non-null object 11 electricity 1578 non-null object 12 hotwater 185 non-null object 13 chilledwater 555 non-null object 14 steam 370 non-null object 15 water 146 non-null object 16 irrigation 37 non-null object 17 solar 5 non-null object 18 gas 177 non-null object 19 industry 579 non-null object 20 subindustry 579 non-null object 21 heatingtype 215 non-null object 22 yearbuilt 817 non-null float64 23 date_opened 21 non-null object 24 numberoffloors 441 non-null float64 25 occupants 230 non-null float64 26 energystarscore 163 non-null object 27 eui 299 non-null object 28 site_eui 163 non-null object 29 source_eui 163 non-null object 30 leed_level 136 non-null object 31 rating 184 non-null object dtypes: float64(9), object(23) memory usage: 409.1+ KB
building.isna().sum()/len(building)*100
building_id 0.000000 site_id 0.000000 building_id_kaggle 11.430318 site_id_kaggle 2.261614 primaryspaceusage 1.283619 sub_primaryspaceusage 1.283619 sqm 0.000000 sqft 0.000000 lat 14.486553 lng 14.486553 timezone 0.000000 electricity 3.545232 hotwater 88.691932 chilledwater 66.075795 steam 77.383863 water 91.075795 irrigation 97.738386 solar 99.694377 gas 89.180929 industry 64.608802 subindustry 64.608802 heatingtype 86.858191 yearbuilt 50.061125 date_opened 98.716381 numberoffloors 73.044010 occupants 85.941320 energystarscore 90.036675 eui 81.723716 site_eui 90.036675 source_eui 90.036675 leed_level 91.687042 rating 88.753056 dtype: float64
# Building without id or primary usage information is not usable
building = building[building["primaryspaceusage"].notna()]
building = building[building["site_id_kaggle"].notna()]
building = building.reset_index().drop("index", axis=1)
# Determining Continent Based on Longitude
print("North America ", len(building[building["lng"] < -30]))
print("Europe ", len(building[building["lng"] > -30]))
North America 1148 Europe 235
building["continent"] = ["north america" if x < -30 else "europe" for x in building["lng"]]
fig = px.scatter_geo(building,
lat=building["lat"].unique()[~np.isnan(building["lat"].unique())],
lon=building["lng"].unique()[~np.isnan(building["lng"].unique())],
title="Number of Building Based on Location",
hover_name=building["lat"].value_counts(),
color=building["lng"].unique()[~np.isnan(building["lng"].unique())] > -30,
color_discrete_sequence=["#7A97FA", "#EBC929"])
fig.update_layout(showlegend=False)
fig.update_traces(marker={"size":10})
fig.show()
building["primaryspaceusage"].unique()
array(['Lodging/residential', 'Education', 'Office',
'Entertainment/public assembly', 'Other', 'Retail', 'Parking',
'Public services', 'Warehouse/storage', 'Food sales and service',
'Religious worship', 'Healthcare', 'Utility', 'Technology/science',
'Manufacturing/industrial', 'Services'], dtype=object)
floor_mean = building.groupby("primaryspaceusage").mean()["numberoffloors"]
C:\Users\User\AppData\Local\Temp\ipykernel_9984\570472113.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Fill numberoffloors Column Based on Average of Same Primary Usage Buildings
for i in building["primaryspaceusage"].unique():
if pd.isna(floor_mean[i]):
building.loc[building["primaryspaceusage"] == i, "numberoffloors"] = building.loc[building["primaryspaceusage"] == i,
"numberoffloors"].fillna(round(floor_mean.mean()))
else:
building.loc[building["primaryspaceusage"] == i, "numberoffloors"] = building.loc[building["primaryspaceusage"] == i,
"numberoffloors"].fillna(round(floor_mean[i]))
building["numberoffloors"] = building["numberoffloors"].astype(int)
building.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 building_id 1599 non-null object 1 site_id 1599 non-null object 2 building_id_kaggle 1449 non-null float64 3 site_id_kaggle 1599 non-null float64 4 primaryspaceusage 1599 non-null object 5 sub_primaryspaceusage 1599 non-null object 6 sqm 1599 non-null float64 7 sqft 1599 non-null float64 8 lat 1383 non-null float64 9 lng 1383 non-null float64 10 timezone 1599 non-null object 11 electricity 1543 non-null object 12 hotwater 169 non-null object 13 chilledwater 535 non-null object 14 steam 370 non-null object 15 water 146 non-null object 16 irrigation 37 non-null object 17 solar 5 non-null object 18 gas 173 non-null object 19 industry 563 non-null object 20 subindustry 563 non-null object 21 heatingtype 199 non-null object 22 yearbuilt 784 non-null float64 23 date_opened 0 non-null object 24 numberoffloors 1599 non-null int32 25 occupants 221 non-null float64 26 energystarscore 163 non-null object 27 eui 299 non-null object 28 site_eui 163 non-null object 29 source_eui 163 non-null object 30 leed_level 136 non-null object 31 rating 168 non-null object 32 continent 1599 non-null object dtypes: float64(8), int32(1), object(24) memory usage: 406.1+ KB
building.head()
| building_id | site_id | building_id_kaggle | site_id_kaggle | primaryspaceusage | sub_primaryspaceusage | sqm | sqft | lat | lng | ... | date_opened | numberoffloors | occupants | energystarscore | eui | site_eui | source_eui | leed_level | rating | continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Panther_lodging_Dean | Panther | NaN | 0.0 | Lodging/residential | Residence Hall | 508.8 | 5477.0 | 28.517689 | -81.379039 | ... | NaN | 6 | NaN | NaN | 271 | NaN | NaN | None | NaN | north america |
| 1 | Panther_lodging_Shelia | Panther | NaN | 0.0 | Lodging/residential | Residence Hall | 929.0 | 10000.0 | 28.517689 | -81.379039 | ... | NaN | 6 | NaN | NaN | 62 | NaN | NaN | None | NaN | north america |
| 2 | Panther_lodging_Ricky | Panther | NaN | 0.0 | Lodging/residential | Residence Hall | 483.1 | 5200.0 | 28.517689 | -81.379039 | ... | NaN | 6 | NaN | NaN | 534 | NaN | NaN | None | NaN | north america |
| 3 | Panther_education_Rosalie | Panther | 0.0 | 0.0 | Education | Research | 690.5 | 7432.0 | 28.517689 | -81.379039 | ... | NaN | 4 | NaN | NaN | 276 | NaN | NaN | None | NaN | north america |
| 4 | Panther_education_Misty | Panther | 1.0 | 0.0 | Education | Research | 252.7 | 2720.0 | 28.517689 | -81.379039 | ... | NaN | 4 | NaN | NaN | 375 | NaN | NaN | None | NaN | north america |
5 rows × 33 columns
# Get Footprint Area to Determine Sites with Small/Large Buildings
building["footprint_area"] = building["sqm"] / building["numberoffloors"]
fig = plt.subplots(figsize=(15,20))
grid = plt.GridSpec(nrows=4,
ncols=2)
ax1 = plt.subplot(grid[0, 0:])
ax2 = plt.subplot(grid[1,0])
ax3 = plt.subplot(grid[1,1])
ax4 = plt.subplot(grid[2,0])
ax5 = plt.subplot(grid[2,1])
ax6 = plt.subplot(grid[3, 0:])
# Areas of Buildings in Each Continent
sns.histplot(ax=ax1,
data=building,
x="sqm",
palette=["#7A97FA", "#F0CD75"],
kde=True,
hue="continent",
multiple="stack",
bins=30)
ax1.legend(labels=["Europe", "North America"],
title="Continet")
ax1.lines[0].set_color("#FA709F")
ax1.lines[1].set_color("#FA709F")
ax1.set(ylabel="Number of Buildings",
xlabel="Square Meters")
ax1.set_title("Building Area")
# Percentage of Building Primary Usage in Each Continent
top5 = list(building["primaryspaceusage"].value_counts().iloc[:5].index)
temp = building[["primaryspaceusage"]].copy()
temp.loc[temp["primaryspaceusage"].isin(top5) == False, "primaryspaceusage"] = "Other"
sns.countplot(ax=ax2,
data=temp,
y="primaryspaceusage",
palette=["#7A97FA", "#F0CD75"],
order=temp["primaryspaceusage"].value_counts().index,
hue=building["continent"])
ax2.legend(labels=["North America", "Europe"],
title="Continet")
ax2.set_title("Primary Category of Building Usage")
ax2.set(ylabel="",
xlabel="Number of Buildings",
xlim=(0, 550))
for patch in ax2.patches:
ax2.text(size=12,
x=patch.get_width()+30,
y=patch.get_y()+patch.get_height()/1.7,
s="---" + "{:.2f}".format(patch.get_width()/len(building)*100) + "%",
ha="center")
# Percentage of Building Primary Usage in Each Continent
top5 = list(building["sub_primaryspaceusage"].value_counts().iloc[:5].index)
temp = building[["sub_primaryspaceusage"]].copy()
temp.loc[temp["sub_primaryspaceusage"].isin(top5) == False, "sub_primaryspaceusage"] = "Other"
sns.countplot(ax=ax3,
data=temp,
y="sub_primaryspaceusage",
palette=["#7A97FA", "#F0CD75"],
order=temp["sub_primaryspaceusage"].value_counts().index,
hue=building["continent"])
ax3.legend(labels=["North America", "Europe"],
title="Continet")
ax3.set_title("Secondary Category of Building Usage")
ax3.set(xlabel="Number of Buildings",
ylabel="",
xlim=(0, 650))
for patch in ax3.patches:
ax3.text(size=12,
x=patch.get_width()+35,
y=patch.get_y() + patch.get_height()/1.7,
s="---" + "{:.2f}".format(patch.get_width()/len(building)*100) + "%",
ha="center")
# Building Footprint per Site
temp = []
for i in building["site_id_kaggle"].unique():
temp.append(building.loc[np.where(building["site_id_kaggle"] == i)]["footprint_area"])
sns.boxplot(ax=ax4,
data=temp,
color="#A3FFAF",
showfliers=False),
ax4.set_title("Building Footprint Per Site")
ax4.set(xlabel="Site ID")
# Building Floor Count per Site
temp = []
for i in building["site_id_kaggle"].unique():
temp.append(building.loc[np.where(building["site_id_kaggle"] == i)]["numberoffloors"])
sns.boxplot(ax=ax5,
data=temp,
color="#A3FFAF",
showfliers=False)
ax5.set_title("Building Floor Count Per Site")
ax5.set(yticks=(np.arange(0, 22, 1)),
xlabel="Site ID")
# Buildings Built per Year
sns.kdeplot(ax=ax6,
data=building["yearbuilt"],
color="#FA7095")
ax6.set_title("Buildings Built Per Year (50% Missing Values)")
ax6.set(xlabel="",
ylabel="Density")
plt.tight_layout()
plt.show()
C:\Users\User\AppData\Local\Temp\ipykernel_9984\542844492.py:6: MatplotlibDeprecationWarning: Auto-removal of overlapping axes is deprecated since 3.6 and will be removed two minor releases later; explicitly call ax.remove() as needed.
site_idair_temperature - Degrees Celsiuscloud_coverage - Portion of the sky covered in clouds, in oktasdew_temperature - Degrees Celsiusprecip_depth_1_hr - Millimeterssea_level_pressure - Millibar/hectopascalswind_direction - Compass direction (0-360)wind_speed - Meters per secondweather = pd.read_csv("./data/weather.csv")
weather.head()
| site_id | timestamp | air_temperature | cloud_coverage | dew_temperature | precip_depth_1_hr | sea_level_pressure | wind_direction | wind_speed | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2016-01-01 00:00:00 | 25.0 | 6.0 | 20.0 | NaN | 1019.7 | 0.0 | 0.0 |
| 1 | 0 | 2016-01-01 01:00:00 | 24.4 | NaN | 21.1 | -1.0 | 1020.2 | 70.0 | 1.5 |
| 2 | 0 | 2016-01-01 02:00:00 | 22.8 | 2.0 | 21.1 | 0.0 | 1020.2 | 0.0 | 0.0 |
| 3 | 0 | 2016-01-01 03:00:00 | 21.1 | 2.0 | 20.6 | 0.0 | 1020.1 | 0.0 | 0.0 |
| 4 | 0 | 2016-01-01 04:00:00 | 20.0 | 2.0 | 20.0 | -1.0 | 1020.0 | 250.0 | 2.6 |
weather.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 417016 entries, 0 to 417015 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 site_id 417016 non-null int64 1 timestamp 417016 non-null object 2 air_temperature 416857 non-null float64 3 cloud_coverage 207395 non-null float64 4 dew_temperature 416576 non-null float64 5 precip_depth_1_hr 271139 non-null float64 6 sea_level_pressure 385133 non-null float64 7 wind_direction 398378 non-null float64 8 wind_speed 416252 non-null float64 dtypes: float64(7), int64(1), object(1) memory usage: 28.6+ MB
weather.isna().sum()/len(weather)*100
site_id 0.000000 timestamp 0.000000 air_temperature 0.038128 cloud_coverage 50.266896 dew_temperature 0.105512 precip_depth_1_hr 34.981152 sea_level_pressure 7.645510 wind_direction 4.469373 wind_speed 0.183206 dtype: float64
# Missing Percentage of Features per Site
missing_percentage = weather.drop("site_id", axis=1).isna().groupby(weather["site_id"]).sum().apply(
lambda x: round(x / weather["site_id"].value_counts().sort_index() * 100))
missing_percentage["total_number"] = weather["site_id"].value_counts().sort_index()
missing_percentage
| timestamp | air_temperature | cloud_coverage | dew_temperature | precip_depth_1_hr | sea_level_pressure | wind_direction | wind_speed | total_number | |
|---|---|---|---|---|---|---|---|---|---|
| site_id | |||||||||
| 0 | 0.0 | 0.0 | 43.0 | 0.0 | 0.0 | 1.0 | 3.0 | 0.0 | 26304 |
| 1 | 0.0 | 0.0 | 80.0 | 0.0 | 100.0 | 0.0 | 0.0 | 0.0 | 26050 |
| 2 | 0.0 | 0.0 | 30.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 26303 |
| 3 | 0.0 | 0.0 | 44.0 | 0.0 | 0.0 | 2.0 | 2.0 | 0.0 | 26297 |
| 4 | 0.0 | 0.0 | 47.0 | 0.0 | 6.0 | 1.0 | 1.0 | 0.0 | 26299 |
| 5 | 0.0 | 0.0 | 67.0 | 0.0 | 100.0 | 100.0 | 3.0 | 0.0 | 25996 |
| 6 | 0.0 | 0.0 | 36.0 | 0.0 | 0.0 | 2.0 | 8.0 | 0.0 | 26286 |
| 7 | 0.0 | 0.0 | 100.0 | 0.0 | 90.0 | 0.0 | 0.0 | 0.0 | 25187 |
| 8 | 0.0 | 0.0 | 43.0 | 0.0 | 0.0 | 1.0 | 3.0 | 0.0 | 26304 |
| 9 | 0.0 | 0.0 | 43.0 | 1.0 | 0.0 | 3.0 | 30.0 | 1.0 | 26261 |
| 10 | 0.0 | 0.0 | 29.0 | 0.0 | 0.0 | 2.0 | 6.0 | 1.0 | 26258 |
| 11 | 0.0 | 0.0 | 100.0 | 0.0 | 90.0 | 0.0 | 0.0 | 0.0 | 25187 |
| 12 | 0.0 | 0.0 | 0.0 | 0.0 | 100.0 | 0.0 | 0.0 | 0.0 | 26034 |
| 13 | 0.0 | 0.0 | 50.0 | 0.0 | 2.0 | 1.0 | 3.0 | 0.0 | 26302 |
| 14 | 0.0 | 0.0 | 42.0 | 0.0 | 0.0 | 1.0 | 3.0 | 0.0 | 26293 |
| 15 | 0.0 | 0.0 | 56.0 | 0.0 | 77.0 | 6.0 | 3.0 | 0.0 | 25655 |
# Metric Correlation with Each Other
f = plt.figure(figsize=(8, 6))
sns.heatmap(data=weather.drop(["timestamp", "site_id"], axis=1).corr(),
annot=True,
cmap="coolwarm")
<Axes: >
weather.drop("timestamp", axis=1).groupby("site_id").mean()
| air_temperature | cloud_coverage | dew_temperature | precip_depth_1_hr | sea_level_pressure | wind_direction | wind_speed | |
|---|---|---|---|---|---|---|---|
| site_id | |||||||
| 0 | 22.744816 | 2.998727 | 17.379328 | 1.240589 | 1017.949437 | 152.521015 | 3.391472 |
| 1 | 12.015507 | 0.059266 | 7.578107 | NaN | 1015.757132 | 197.769373 | 4.013777 |
| 2 | 24.963702 | 2.083712 | 3.901814 | 0.155325 | 1011.436563 | 153.629430 | 2.899795 |
| 3 | 15.502518 | 4.079235 | 8.285693 | 0.895980 | 1018.052063 | 184.295922 | 3.905433 |
| 4 | 14.725362 | 1.844576 | 9.834399 | 0.440202 | 1016.797495 | 205.512890 | 3.788642 |
| 5 | 11.058629 | 0.388354 | 8.353095 | NaN | NaN | 206.606135 | 4.846472 |
| 6 | 15.204244 | 0.681089 | 7.684143 | 1.029495 | 1017.561461 | 129.106869 | 2.237193 |
| 7 | 7.572136 | NaN | 2.037952 | 11.675633 | 1015.596794 | 206.995037 | 3.390690 |
| 8 | 22.744816 | 2.998727 | 17.379328 | 1.240589 | 1017.949437 | 152.521015 | 3.391472 |
| 9 | 21.325261 | 0.592396 | 13.848085 | 0.775686 | 1016.404579 | 123.388701 | 2.157103 |
| 10 | 11.704354 | 0.338958 | -0.304675 | 0.353491 | 1015.508073 | 182.096604 | 3.101170 |
| 11 | 7.572136 | NaN | 2.037952 | 11.675633 | 1015.596794 | 206.995037 | 3.390690 |
| 12 | 9.959092 | 5.671785 | 6.693409 | NaN | 1014.017353 | 207.134076 | 5.493939 |
| 13 | 9.115628 | 2.778787 | 2.836979 | 0.808871 | 1016.426988 | 187.301104 | 4.112377 |
| 14 | 13.109654 | 0.665832 | 6.478977 | 1.033215 | 1016.608501 | 184.702135 | 3.100911 |
| 15 | 9.079138 | 1.092101 | 4.512823 | 3.381837 | 1017.192111 | 186.648787 | 3.619547 |
weather["timestamp"] = pd.to_datetime(weather["timestamp"])
weather.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 417016 entries, 0 to 417015 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 site_id 417016 non-null int64 1 timestamp 417016 non-null datetime64[ns] 2 air_temperature 416857 non-null float64 3 cloud_coverage 207395 non-null float64 4 dew_temperature 416576 non-null float64 5 precip_depth_1_hr 271139 non-null float64 6 sea_level_pressure 385133 non-null float64 7 wind_direction 398378 non-null float64 8 wind_speed 416252 non-null float64 dtypes: datetime64[ns](1), float64(7), int64(1) memory usage: 28.6 MB
weather["year"] = weather["timestamp"].dt.year
weather["month"] = weather["timestamp"].dt.month
weather["day"] = weather["timestamp"].dt.day
weather["time"] = weather["timestamp"].dt.time
weather["date"] = weather["timestamp"].dt.date
weather.drop("timestamp", axis=1, inplace=True)
weather.drop(["day", "time", "date"], axis=1).groupby(["site_id", "year", "month"]).mean()
| air_temperature | cloud_coverage | dew_temperature | precip_depth_1_hr | sea_level_pressure | wind_direction | wind_speed | |||
|---|---|---|---|---|---|---|---|---|---|
| site_id | year | month | |||||||
| 0 | 2016 | 1 | 14.714305 | 3.667774 | 9.211336 | 1.796770 | 1018.112466 | 196.702997 | 3.642876 |
| 2 | 16.139655 | 2.312073 | 8.870546 | 0.593391 | 1020.102457 | 197.967836 | 4.054741 | ||
| 3 | 21.266263 | 3.266272 | 14.429973 | 1.740591 | 1018.672666 | 174.499314 | 3.633602 | ||
| 4 | 22.431250 | 3.026906 | 14.591806 | 0.197222 | 1017.275145 | 142.922636 | 3.759306 | ||
| 5 | 24.734274 | 2.764957 | 17.237097 | 1.215054 | 1016.178997 | 157.332402 | 3.219758 | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15 | 2018 | 8 | 21.362450 | 1.160622 | 18.272005 | 6.606557 | 1016.772147 | 159.168975 | 2.497174 |
| 9 | 17.971449 | 1.196364 | 15.218987 | 11.818182 | 1021.730233 | 142.583942 | 2.655837 | ||
| 10 | 9.532838 | 1.474178 | 6.563599 | 4.928934 | 1017.237193 | 205.801105 | 3.675639 | ||
| 11 | 1.624784 | 0.914530 | -1.331556 | 2.246753 | 1017.016743 | 195.087977 | 3.906494 | ||
| 12 | -0.365629 | 0.500000 | -2.878890 | 1.110701 | 1018.815710 | 190.013680 | 3.731935 |
576 rows × 7 columns
monthly_average_weather = weather.drop(["day", "time", "date"], axis=1).groupby(["site_id", "year", "month"]).mean()
monthly_average_weather.loc[0, 2016]["air_temperature"]
month 1 14.714305 2 16.139655 3 21.266263 4 22.431250 5 24.734274 6 27.366389 7 28.552823 8 27.613575 9 26.871944 10 24.038172 11 20.055417 12 19.956989 Name: air_temperature, dtype: float64
# Function for Plotting Each Year's Metric Sorted by Site
def plot_each_year(column, title):
fig = make_subplots(rows=len(weather["site_id"].unique()),
cols=1,
start_cell="top-left",
shared_xaxes=True,
row_titles=[building[building["site_id_kaggle"] == i].iloc[0]["site_id"] for i in weather["site_id"].unique()])
name_set = []
for j in weather["year"].unique():
for i in weather["site_id"].unique():
fig.add_trace(go.Scatter(x=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
y=monthly_average_weather.loc[i, j][column],
name=str(j),
legendgroup=str(j),
showlegend=True if j not in name_set else False,
line=dict(color="#7A97FA" if j==2016 else ("#FA7095" if j==2017 else "#70E680"))),
row=i+1, col=1)
if j not in name_set:
name_set.append(j)
fig.update_layout(title_text=title,
autosize=False,
width=900,
height=2000)
fig.show()
plot_each_year(column="air_temperature",
title="Average Air Temperature per Month each Year (c)")
plot_each_year(column="cloud_coverage",
title="Average Cloud Coverage per Month each Year (oktas)")
plot_each_year(column="wind_speed",
title="Average Wind Speed per Month each Year (m/s)")
plot_each_year(column="precip_depth_1_hr",
title="Average Precipitation per Month each Year (mm)")
# Plot Distribution of Each Metric by Site
fig = make_subplots(rows=4,
cols=1,
start_cell="top-left",
shared_xaxes=True)
for n, i in enumerate(weather.drop(["site_id", "year", "month", "day", "time", "date", "precip_depth_1_hr", "sea_level_pressure", "wind_direction"],
axis=1).columns):
fig.add_trace(go.Box(x=weather["site_id"],
y=weather[i],
name=i,
boxpoints=False),
row=n+1, col=1)
fig.update_layout(title_text="Distribution of Different Weather Conditions by Site",
autosize=False,
width=900,
height=1000)
fig.show()